##### ########################################################## ######
#####                                                            ######
#####   Input: debates/sentence-level dictionary scores          ######
#####   Output: Estimated stan models                            ######
#####                                                            ######
#####                                                            ######
##### ########################################################## ######

rm(list=ls())
range01 <- function(x){(x-min(x))/(max(x)-min(x))}

# Load libraries

library(data.table) # CRAN v.1.13.6
library(ggplot2) # CRAN v.3.3.3
library(rstan) # CRAN v.2.21.2
library(bayesplot) # CRAN v.1.7.2

# Options

rstan_options(auto_write = TRUE)
options(mc.cores = 3)

sigma_prior_variance <- 2
sigma_alpha_prior_variance <- 2
sigma_delta_prior_variance <- 2
sigma_gamma_prior_variance <- 2

vars <- c("affect_std", "posemo_std", "negemo_std", "fact_std", "anecdote_std", "aggression_std", "complexity_std", "repetition_std")

# Stan settings

N_iter <- 500
N_warm <- 250
N_chain <- 3
N_core <- parallel::detectCores() - 1
max_treedepth <- 10
adapt_delta <- 0.8

# Load data 

load("working/speech_scores.Rdata")

## Drop all debates with fewer than 5 participants

speech_scores[,keep:=.N > 4, by = section_id]
speech_scores <- speech_scores[keep == TRUE]

## Drop the speaker

speech_scores <- speech_scores[is_speaker == F]

## Drop all debates before Blair

speech_scores <- speech_scores[speech_scores$parliamentary_term != "1992-1997"]

## Gender 
  
speech_scores$person_id_new <- as.numeric(as.factor(speech_scores$person_id))

Ri_gender <- speech_scores[,list(gender = unique(gender == "Female")), by = person_id_new]
setkey(Ri_gender, person_id_new)
Ri_gender <- Ri_gender$gender

## Controls

X_ind <- speech_scores[,list(party = names(table(party))[order(table(party), decreasing = T)][1],
                             age = mean(age_years),
                             frontbench = any(is_gov_minister|is_opp_minister),
                             committee_chair = any(is_committee_chair),
                             margin = mean(margin),
                             has_degree = unique(has_degree),
                             business = unique(occupation_class == "Business"),
                             manual = unique(occupation_class == "Manual"),
                             political = unique(occupation_class == "Political"),
                             professional = unique(occupation_class == "Professional")
                             ), by = person_id_new]



X_ind$party[!X_ind$party %in% c("Labour", "Conservative", "LibDem")] <- "Other"
X_ind$party <- factor(X_ind$party, levels = c("Conservative", "Labour", "LibDem", "Other"))

Ri_party <- X_ind$party # Party ID needed separately for interaction model
Ri_party <- as.numeric(Ri_party)

X_ind$age <- as.numeric(scale(X_ind$age))
X_ind$margin <- as.numeric(scale(X_ind$margin))

X_ind <- model.matrix(~ party + age + frontbench + committee_chair + margin + has_degree + business + manual + political + professional, data =X_ind)[,-1]

metadata <- list()
metadata$labels_X_ind <- c("Labour", "LibDem", "Other", "age", "frontbench", "committee_chair", "margin", "has_degree", "business", "manual", "political", "professional")

## Debate

R_debate_id <- as.numeric(as.factor(as.character(speech_scores$section_id)))


## IDs

speech_scores$person_parl_id <- paste0(speech_scores$person_id, "_",speech_scores$session)

speech_scores$person_parl_id <- as.numeric(as.factor(speech_scores$person_parl_id))

## Gender 

Ri_gender <- speech_scores[,list(gender = unique(gender == "Female")),by = person_parl_id]
setkey(Ri_gender, person_parl_id)
Ri_gender <- Ri_gender$gender

## Parliamentary term

speech_scores$session <- as.factor(speech_scores$session)
Ri_term <- speech_scores[,list(parliamentary_term = unique(session)),by = person_parl_id]
Ri_term$parliamentary_term <- droplevels(Ri_term$parliamentary_term)
setkey(Ri_term, person_parl_id)

## Individual-level controls

## Over time analysis

speech_scores$gov_mp <- FALSE
speech_scores$gov_mp[speech_scores$party == "Labour" & speech_scores$parliamentary_term %in% c("1997-2001", "2001-2005", "2005-2010")] <- TRUE
speech_scores$gov_mp[speech_scores$party %in% c("Conservative", "LibDem") & speech_scores$parliamentary_term %in% c("2010-2015")] <- TRUE
speech_scores$gov_mp[speech_scores$party %in% c("Conservative") & speech_scores$parliamentary_term %in% c("2015-2017","2017-2019")] <- TRUE

X_ind <- speech_scores[,list(party = names(table(party))[order(table(party), decreasing = T)][1],
                             gov_mp = names(table(gov_mp))[order(table(gov_mp), decreasing = T)][1],
                             age = mean(age_years),
                             frontbench = any(is_gov_minister|is_opp_minister),
                             committee_chair = any(is_committee_chair),
                             margin = mean(margin),
                             has_degree = unique(has_degree),
                             business = unique(occupation_class == "Business"),
                             manual = unique(occupation_class == "Manual"),
                             political = unique(occupation_class == "Political"),
                             professional = unique(occupation_class == "Professional")
                             ), by = person_parl_id]

X_ind$party[!X_ind$party %in% c("Labour", "Conservative", "LibDem")] <- "Other"
X_ind$party <- factor(X_ind$party, levels = c("Conservative", "Labour", "LibDem", "Other"))
X_ind$age <- as.numeric(scale(X_ind$age))
X_ind$margin <- as.numeric(scale(X_ind$margin))

setkey(X_ind, person_parl_id)
Ri_party <- as.numeric(X_ind$party)

X_ind <- model.matrix(~ party + gov_mp + age + frontbench + committee_chair + margin + has_degree + business + manual + political + professional, data =X_ind)[,-1]

metadata <- list()
metadata$labels_X_ind <- c("Labour", "LibDem", "Other","gov_mp", "age", "frontbench", "committee_chair", "margin", "has_degree", "business", "manual", "political", "professional")

## Debate

R_debate_id <- as.numeric(as.factor(as.character(speech_scores$section_id)))

standata <- list(N_obs = nrow(speech_scores),
                 N_MPs = length(unique(speech_scores$person_parl_id)),
                 N_terms = length(unique(Ri_term$parliamentary_term)),
                 N_debates = length(unique(R_debate_id)),
                 N_ind_covars = ncol(X_ind),
                 N_parties = max(Ri_party),
                 R_Y = speech_scores$posemo_std,
                 Ri_gender = as.numeric(Ri_gender),
                 Ri_party = Ri_party,
                 R_debate_id = R_debate_id,
                 X_ind = X_ind,
                 Ri_term = as.numeric(Ri_term$parliamentary_term),
                 R_mp_id = speech_scores$person_parl_id,
                 sigma_prior_variance = sigma_prior_variance,
                 sigma_alpha_prior_variance = sigma_alpha_prior_variance,
                 sigma_delta_prior_variance = sigma_delta_prior_variance)

save(standata, file = "working/standata_model2.Rdata")

session_labels <- levels(speech_scores$session)
session_labels <- gsub("-19|-20","-",substring(session_labels,3,10))

## ########
## Models
  

# Baseline model

source("stancode/model2.R")
  
stan_model2 <- stan_model(model_code = simple_mod_code)
  
for(v in vars){
  
  standata$R_Y <- as.numeric(as.data.frame(speech_scores[,v, with = FALSE])[,1])
  
  posterior <- sampling(stan_model2, 
                          data = standata, 
                          iter = N_iter, 
                          warmup = N_warm,
                          chains = N_chain, 
                          cores = N_core, 
                          control = list(max_treedepth = max_treedepth,
                                         adapt_delta = adapt_delta),
                          init = 0,
                          seed = 221186,
                          pars = c("alpha_raw"),
                          include = FALSE)  
  
    save(posterior, metadata, file = paste0("working/stan_out/model2/",v,"_posterior.Rdata"))  

}


# Model with controls
  
## Stan Models

source("stancode/model2_control.R")

stan_model2_control <- stan_model(model_code = simple_mod_code)

for(v in vars){
  
  standata$R_Y <- as.numeric(as.data.frame(speech_scores[,v, with = FALSE])[,1])
  
  posterior <- sampling(stan_model2_control, 
                          data = standata, 
                          iter = N_iter, 
                          warmup = N_warm,
                          chains = N_chain, 
                          cores = N_core, 
                          control = list(max_treedepth = max_treedepth,
                                         adapt_delta = adapt_delta),
                          init = 0,
                          seed = 221186,
                          pars = c("alpha_raw"),
                          include = FALSE)  
  
  
    save(posterior, metadata, file = paste0("working/stan_out/model2_control/",v,"_posterior.Rdata"))  
  
}

